%matplotlib inline
import pandas as pd
import numpy as np
import random
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
# Load the data and get some initial insight.
pdata = pd.read_csv("concrete.csv")
pdata.describe().transpose()
sns.boxplot(pdata['age'])
# The boxplot above suggests outliers; missing values are imputed with the median.
# FIX: chained calls like `pdata.age.fillna(..., inplace=True)` act on a
# possibly-copied Series — pandas emits a FutureWarning and under copy-on-write
# (pandas 3.0 behavior) the write is silently lost. Assign back to the column.
pdata['age'] = pdata['age'].fillna(pdata['age'].median())
sns.boxplot(pdata['slag'])
# the above analysis suggests that we have some outliers
pdata['slag'] = pdata['slag'].fillna(pdata['slag'].median())
sns.boxplot(pdata['superplastic'])
# the above analysis suggests that we have some outliers
pdata['superplastic'] = pdata['superplastic'].fillna(pdata['superplastic'].median())
# Drop any rows that still contain NaN in the remaining columns.
pdata.dropna(inplace = True)
print(pdata.shape)
pdata.head()
# check for null values after cleaning
pdata.isnull().sum()
Based on the summary above, we do not see any bad or missing data.
unique_values = pdata.nunique()
print('Count unique values in each column')
print(unique_values)
# FIX: sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# sns.histplot(...) is the documented replacement for distplot(..., kde=False).
sns.histplot(pdata.strength)
sns.histplot(pdata.slag)
sns.histplot(pdata.ash)
sns.histplot(pdata.superplastic)
pdata.superplastic.describe()
sns.histplot(pdata.coarseagg)
sns.histplot(pdata.fineagg)
sns.histplot(pdata.age)
We see that the data is not very smooth and contains some outliers.
# FIX: sns.distplot is removed in modern seaborn; sns.histplot is the
# replacement for distplot(..., kde=False).
sns.histplot(pdata.cement)
sns.histplot(pdata.water)
We see some outliers here.
# Histograms of every column for a quick overview of the distributions,
# followed by a pairwise-correlation heatmap.
pdata.hist(bins=20, figsize=(16, 12))
plt.show()
correlation_matrix = pdata.corr()
sns.heatmap(correlation_matrix, annot=True)
Based on the plot above, we can see that cement, superplastic, and age are the major factors impacting strength.
# Pairwise scatter plots of all columns to eyeball relationships with strength.
sns.pairplot(pdata)
Based on the plot above, we can see that strength increases as cement, age, and superplastic increase.
# Separate features (all columns except the target) from the target column.
X = pdata.drop('strength', axis=1)
Y = pdata['strength']
from sklearn.model_selection import train_test_split
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
X_train.head()
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training set only (avoids test-set leakage), then
# apply the same transform to the test set. Note: fit_transform/transform
# return numpy arrays, so X_train/X_test are no longer DataFrames after this.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
def label_feature(bars, ax=None):
    """Annotate each bar in *bars* with its height, formatted to 2 decimals.

    Parameters
    ----------
    bars : iterable of matplotlib bar patches (the container returned by ``ax.bar``).
    ax : matplotlib Axes to annotate. Defaults to the current axes
        (``plt.gca()``); the original code read a module-level ``ax``, which at
        both call sites is the most recently created — i.e. current — axes, so
        this default is backward compatible.
    """
    if ax is None:
        ax = plt.gca()
    for bar in bars:
        height = bar.get_height()
        ax.annotate('{:.2f}'.format(height),
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    ha='center', va='bottom')
# Column names used as x-axis labels in the feature-importance bar charts;
# must match the column order of X (pdata minus 'strength').
features = ["cement", "slag", "ash", "water", "superplastic", "coarseagg", "fineagg", "age"]
from sklearn.linear_model import LinearRegression
# Baseline model: ordinary least squares on the standardized features.
lr = LinearRegression()
lr.fit(X_train, Y_train)
Y_pred_lr = lr.predict(X_test)
# R^2 on the held-out test set.
score1 = lr.score(X_test, Y_test)
score1
# Bar chart of the regression coefficients. Because the features were
# standardized, coefficient magnitudes are comparable across features.
x = np.arange(len(features))
width = 0.3
fig, ax = plt.subplots(figsize=(8,6))
bars = ax.bar(x, lr.coef_, width)
ax.set_ylabel('importances')
ax.set_xlabel('features')
ax.set_xticks(x)
ax.set_xticklabels(features, rotation=90)
label_feature(bars)  # NOTE: label_feature reads the module-level `ax` created above
fig.tight_layout()
plt.show()
from sklearn.ensemble import RandomForestRegressor
# Random forest with 100 trees, otherwise default hyperparameters.
# NOTE(review): no random_state is set, so scores vary between runs —
# consider fixing the seed for reproducibility.
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train, Y_train)
Y_pred_rf = rf.predict(X_test)
# R^2 on the held-out test set.
score2 = rf.score(X_test, Y_test)
print(rf.get_params())
score2
from sklearn.tree import DecisionTreeRegressor
# Single decision tree with default hyperparameters (grown to full depth,
# so it is prone to overfitting; tuned below with GridSearchCV).
dt = DecisionTreeRegressor()
dt.fit(X_train, Y_train)
Y_pred_dt = dt.predict(X_test)
# R^2 on the held-out test set.
score3 = dt.score(X_test, Y_test)
print(dt.get_params())
score3
# Side-by-side bar chart comparing feature importances of the decision tree
# and the random forest (bars offset by half a width around each tick).
fig, ax = plt.subplots(figsize=(10,6))
dt_bars = ax.bar(x-(width/2), dt.feature_importances_, width, label='Decision Tree')
rf_bars = ax.bar(x+(width/2), rf.feature_importances_, width, label='Random Forest')
ax.set_ylabel('importance')
ax.set_xlabel('features')
ax.set_xticks(x)
ax.set_xticklabels(features, rotation=90)
# Place the legend outside the axes, anchored to the top-right corner.
ax.legend(loc="upper left", bbox_to_anchor=(1,1))
# NOTE: label_feature reads the module-level `ax`, so the variable here must
# keep the name `ax`.
label_feature(dt_bars)
label_feature(rf_bars)
plt.show()
from sklearn.model_selection import GridSearchCV
# 10-fold cross-validated grid search over random-forest hyperparameters:
# 1*4*2*3*3*4 = 288 candidates x 10 folds, so this cell can take a while.
rf_param_grid = {'bootstrap': [True],
'max_depth': [8, 9, 10, 11],
'max_features': [2, 3],
'min_samples_leaf': [3, 4, 5],
'min_samples_split': [8, 10, 12],
'n_estimators': [10, 20, 30, 100]
}
gscv_rf = GridSearchCV(rf, rf_param_grid, cv=10)
gscv_rf.fit(X_train,Y_train)
# best_score_ is the mean cross-validated R^2 of the best parameter set.
print("r square for Random Forest: {}".format(gscv_rf.best_score_))
print("best hyperparameters for Random Forest: {}".format(gscv_rf.best_params_))
# 10-fold cross-validated grid search over decision-tree hyperparameters.
# FIX: the criterion names "mse" and "mae" were deprecated in scikit-learn 1.0
# and removed in 1.2; the current names are "squared_error" and
# "absolute_error" (requires scikit-learn >= 1.0).
dt_param_grid = {
    "criterion": ["squared_error", "absolute_error"],
    "min_samples_split": [2, 4, 8],
    "max_depth": [2, 6, 8],
    "min_samples_leaf": [2, 4, 10],
    "max_leaf_nodes": [10, 40, 400],
}
gscv_dt = GridSearchCV(dt, dt_param_grid, cv=10)
gscv_dt.fit(X_train,Y_train)
# best_score_ is the mean cross-validated R^2 of the best parameter set.
print("r square for Decision Trees: {}".format(gscv_dt.best_score_))
print("best hyperparameters for Decision Trees: {}".format(gscv_dt.best_params_))
def evaluate(model, x, y):
    """Print a fitted model's mean absolute error and MAPE-based accuracy.

    Parameters
    ----------
    model : fitted estimator exposing a ``predict`` method.
    x : array-like feature matrix, passed straight to ``model.predict``.
    y : array-like of true target values. Must not contain zeros, since the
        percentage error divides by ``y``.

    Returns
    -------
    float
        Accuracy in percent, computed as ``100 - MAPE``.
    """
    predictions = model.predict(x)
    errors = abs(predictions - y)
    # Mean absolute percentage error; undefined when any y == 0.
    mape = 100 * np.mean(errors / y)
    accuracy = 100 - mape
    # FIX: the original label said "degrees" — a leftover from a temperature
    # tutorial; the target here is concrete strength.
    print('Average Error = {:0.4f}.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    return accuracy
# Compare each model's test-set accuracy before and after hyperparameter tuning.
# Random Forest evaluate
rf_accuracy = evaluate(rf, X_test, Y_test)
# Random Forest after tuning
# best_estimator_ was refit on the full training set by GridSearchCV.
rf_tunning_best = gscv_rf.best_estimator_
rf_tunning_accuracy = evaluate(rf_tunning_best, X_test, Y_test)
# Decision Tree evaluate
dt_accuracy = evaluate(dt, X_test, Y_test)
# Decision Tree after tuning
dt_tunning_best = gscv_dt.best_estimator_
dt_tunning_accuracy = evaluate(dt_tunning_best, X_test, Y_test)